View Javadoc
1 /*** 2 * Title: S/MIME Project 3 * Description: Creating S/MIME email transport capabilities. 4 * Copyright: Copyright (c) 2001 5 * @Author Vladimir Radisic 6 * @Version 2.0.1 7 */ 8 9 package org.webdocwf.util.smime.util; 10 11 12 import org.webdocwf.util.smime.exception.SMIMEException; 13 import java.util.Vector; 14 import java.net.MalformedURLException; 15 import java.net.URL; 16 import java.io.InputStream; 17 import java.io.ByteArrayOutputStream; 18 import java.io.File; 19 import org.w3c.dom.Attr; 20 import org.w3c.dom.Document; 21 import org.w3c.dom.NamedNodeMap; 22 import org.w3c.dom.Node; 23 import org.w3c.dom.NodeList; 24 import org.w3c.tidy.Tidy; 25 26 27 /*** 28 * HtmlAnalyzer class is used for parsing html code which has to become content 29 * of the message. For parsing is used JTidy parser. As result of parsing, DOM 30 * (Document Object Model) structure is obtained. It is tree-like construction 31 * with nodes and hierarchical structures that descript input html code. This 32 * structure is easy for browsing and searching for specific html elements and 33 * attributes. By using DOM, all references to resources (image, movie, sound... ), 34 * defined in "src" and "background" attributes, are explored and swapped with 35 * generated unique Content-ID values which are necessary in forming 36 * "multipart/related" MimeMultipart object.<BR> 37 * <BR> 38 * DOM, generated inside of the object of this class, is also used in the process of 39 * generation plain/text message based on, and derived from the given html code. 40 * This plain text is later used in creation of "multipart/alternative" 41 * MimeMultipart object. 42 */ 43 public class HtmlAnalyzer { 44 45 /*** 46 * plain/text representation of page 47 */ 48 private String plainText = ""; 49 50 /*** 51 * Enable/disable p tag in text/html to text/plain conversion. 52 */ 53 private boolean pTagEnable = true; 54 55 /*** 56 * Path to html file or prefix path to the embeded resource's adresses in 57 * html code (for example for "src" attribute of IMG tag). Can be null which 58 * means that prefix won't be added to resources location in the process of 59 * searching for specific adress attributes given in html code. 60 */ 61 private String absolutPath = null; 62 63 /*** 64 * Container for parsed html document in DOM (Document Object Model) 65 * representation. 66 */ 67 private Document doc; 68 69 /*** 70 * Indent from left margin pointer. This information is used in the process of 71 * generation plain text message based on html code. 72 */ 73 private int indent = 0; 74 75 /*** 76 * Current sequential number of OL (ordered list) html element. This information 77 * is used in the process of generation plain text message based on html code. 78 */ 79 private int olNumber = 1; 80 81 /*** 82 * Current html element is OL (ordered list), UN (unordered list) or something 83 * else. This information is used in the process of generation plain text message based 84 * on html code. 85 */ 86 private String ul_ol = ""; 87 88 /*** 89 * Constant used in generating indent from left side. This information is used in 90 * the process of generation plain text message based on html code. 91 */ 92 private final String indentString = 93 "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"; 94 95 /*** 96 * Container for storing pairs of replaced url or file addresses and 97 * corresponding generated Content-ID values. 98 */ 99 private Vector sourceLinks = new Vector(0, 1); 100 101 /*** 102 * Enable/disable swapping resource references in html code with generated 103 * value for Content-ID message bodypart header line. Default value is true 104 * (enable swapping) 105 */ 106 private boolean enableSwapping = true; 107 108 /*** 109 * Constructs HtmlAnalyzer from data given from InputStream. This constructor 110 * parses html code from input stream withouth swaping resources' locations from 111 * atribute's "src" and "background" value with generated Content-ID values. Also, 112 * it is performed generation of plain text message based on html code. 113 * @param content0 html code given as InputStream 114 * @SMIMEException caused by its private method analyze(). 115 */ 116 public HtmlAnalyzer(InputStream content0) throws SMIMEException { 117 Tidy tidy = new Tidy(); 118 119 tidy.setWraplen(1000); 120 tidy.setShowWarnings(false); 121 tidy.setUpperCaseTags(true); 122 doc = (tidy.parseDOM(content0, null)); 123 enableSwapping = false; 124 analyze(doc); 125 plainText = plainText + "\r\n"; 126 } 127 128 /*** 129 * Constructs HtmlAnalyzer from data given from InputStream. This constructor 130 * parses html code from input stream with swaping resources' locations from 131 * atribute's "src and "background" value with generated Content-ID values. In 132 * that process, it is used given second paremeter "path0" which represents 133 * common path to all resources in html code with relative path adresses. Also, 134 * it is performed generation of plain text message based on html code. 135 * @param content0 html code given as InputStream. 136 * @param path0 common path used for resolving all resources in html code with 137 * relative path adresses. 138 * @SMIMEException caused by its private method analyze(). 139 */ 140 public HtmlAnalyzer(InputStream content0, String path0) throws SMIMEException { 141 if (path0 != null) { 142 absolutPath = new String(path0); 143 if (absolutPath.charAt(absolutPath.length() - 1) == '//' || 144 absolutPath.charAt(absolutPath.length() - 1) == '/') 145 absolutPath = absolutPath.substring(0, absolutPath.length() - 1); 146 147 absolutPath = absolutPath.replace('/', File.separatorChar); 148 absolutPath = absolutPath.replace('//', File.separatorChar) + File.separator; 149 } 150 151 Tidy tidy = new Tidy(); 152 153 tidy.setWraplen(1000); 154 tidy.setShowWarnings(false); 155 tidy.setUpperCaseTags(true); 156 doc = (tidy.parseDOM(content0, null)); 157 analyze(doc); 158 plainText = plainText + "\r\n"; 159 } 160 161 /*** 162 * Returns pairs of swapped resource URL adresses or File paths and appropriate 163 * generated Content IDs. 164 * @return Vector object whose even (and 0) indexes contain resource addresses 165 * as File or String objects, and whose odd indexes contain appropriate 166 * swapped Content-ID values. 167 */ 168 public Vector getSwappedAdresses() { 169 return sourceLinks; 170 } 171 172 /*** 173 * Returns plain/text representation of given html code document 174 * @return html document transformed to plain/text. 175 */ 176 public String getPlainText() { 177 return plainText; 178 } 179 180 /*** 181 * Returns html/text document passed throught JTidy html parser. All resource 182 * references which were accessible on the file system are swapped with 183 * generated content ID value. Also, all virtual references to appropriate 184 * InputStream resources (see setContent methods in classes from package 185 * org.webdocwf.util.smime.smime) are also swapped with generated Content-ID 186 * value. 187 * @return parsed html/text document. 188 * @exception SMIMEException caused by non SMIMEException which is: 189 * UnsupportedEncodingException. 190 */ 191 public String getHtmlText() throws SMIMEException { 192 String returnString; 193 194 Tidy tidy = new Tidy(); 195 196 tidy.setWraplen(1000); 197 ByteArrayOutputStream out = new ByteArrayOutputStream(); 198 199 tidy.pprint(doc, out); 200 201 try { 202 returnString = out.toString("ISO-8859-1"); 203 out.close(); 204 } catch (Exception e) { 205 throw SMIMEException.getInstance(this, e, "getHtmlText"); 206 } 207 208 return returnString; 209 } 210 211 /*** 212 * Analyzes html code and creates alternate plain/text message from html code. 213 * Also, it creates Vector with corresponding pairs of resource locations discovered 214 * in html code (values of "background" and "src" attributes) and generated 215 * Content-ID values. 216 * @param node0 node element got from JTidy parser. 217 * @exception SMIMEException caused by MimeAssist.generateID() method or by 218 * its private method existenceOfResource(). 219 */ 220 private void analyze(Node node0) throws SMIMEException { 221 222 if (node0 == null) { 223 return; 224 } 225 String brLine = "\r\n"; 226 int type = node0.getNodeType(); 227 228 boolean pTagEnable_old = true; 229 int indent_old = 0; 230 int olNumber_old = 1; 231 String ul_ol_old = ""; 232 233 switch (type) { 234 case Node.DOCUMENT_NODE: // Document node 235 analyze(((Document) node0).getDocumentElement()); 236 break; 237 238 case Node.ELEMENT_NODE: // Element node 239 String elName = node0.getNodeName(); 240 241 if (elName.equalsIgnoreCase("br")) { 242 plainText = plainText + brLine; 243 if (indent > 0) 244 plainText = plainText + 245 indentString.substring(0, indent - 1); 246 } else if (elName.equalsIgnoreCase("hr")) { 247 plainText = plainText + brLine + 248 "==================================================" + 249 brLine; 250 } else if (elName.equalsIgnoreCase("p")) { 251 if (pTagEnable) { 252 plainText = plainText + brLine + brLine; 253 if (indent > 0) 254 plainText = plainText + 255 indentString.substring(0, indent - 1); 256 } 257 pTagEnable = true; 258 } else if (elName.equalsIgnoreCase("ul")) { 259 pTagEnable_old = pTagEnable; 260 pTagEnable = false; 261 ul_ol_old = ul_ol; 262 ul_ol = elName; 263 indent_old = indent; 264 indent++; 265 } else if (elName.equalsIgnoreCase("ol")) { 266 pTagEnable_old = pTagEnable; 267 pTagEnable = false; 268 ul_ol_old = ul_ol; 269 ul_ol = elName; 270 indent_old = indent; 271 indent++; 272 olNumber_old = olNumber; 273 } else if (elName.equalsIgnoreCase("li")) { 274 pTagEnable = false; 275 if (ul_ol.equalsIgnoreCase("ul")) { 276 plainText = plainText + brLine + 277 indentString.substring(0, indent - 1) + 278 ">> "; 279 } else if (ul_ol.equalsIgnoreCase("ol")) { 280 plainText = plainText + brLine + 281 indentString.substring(0, indent - 1) + 282 olNumber + ". "; 283 olNumber++; 284 } 285 } else if (elName.equalsIgnoreCase("blockquote")) { 286 pTagEnable_old = pTagEnable; 287 pTagEnable = false; 288 indent_old = indent; 289 indent++; 290 plainText = plainText + brLine + 291 indentString.substring(0, indent); 292 } else if (elName.equalsIgnoreCase("q")) { 293 pTagEnable_old = pTagEnable; 294 pTagEnable = false; 295 plainText = plainText + "\""; 296 } else if (elName.equalsIgnoreCase("table")) { 297 plainText = plainText + brLine + 298 "**************************************************" + brLine + 299 "--------------------------------------------------" + brLine + 300 "-- -- -- -- -- -- -- -- -- -- -- -- --" + brLine; 301 } else if (elName.equalsIgnoreCase("tr")) { 302 plainText = plainText + brLine; 303 } else if (elName.equalsIgnoreCase("td")) { 304 plainText = plainText + brLine; 305 } 306 // attributes handling 307 NamedNodeMap attrs = node0.getAttributes(); 308 309 for (int i = 0; i < attrs.getLength(); i++) { 310 attrs.item(i).getNodeName().toUpperCase(); 311 if (enableSwapping && 312 ((attrs.item(i).getNodeName()).equalsIgnoreCase("src") || 313 (attrs.item(i).getNodeName()).equalsIgnoreCase("background"))) { 314 String resource = attrs.item(i).getNodeValue(); 315 String cid = null; 316 317 //******nnn<virtual_file_name> <-- resources got from byte array input stream 318 if (resource.substring(0, 5).equalsIgnoreCase("*****")) { 319 for (int j = 0; j < sourceLinks.size() & cid == null; j = j + 2) { 320 if (sourceLinks.elementAt(j) instanceof String && 321 ((String) sourceLinks.elementAt(j)).equals(resource)) 322 cid = (String) sourceLinks.elementAt(j + 1); 323 } 324 if (cid == null) { 325 cid = MimeAssist.generateID(); 326 sourceLinks.add(resource); 327 sourceLinks.add(cid); 328 } 329 attrs.item(i).setNodeValue("cid:" + cid); 330 } else { 331 File fRes = existenceOfResource(resource); 332 333 if (fRes != null) { 334 for (int j = 0; j < sourceLinks.size() & cid == null; j = j + 2) { 335 if (sourceLinks.elementAt(j) instanceof File && 336 ((File) sourceLinks.elementAt(j)).compareTo(fRes) == 0) 337 cid = (String) sourceLinks.elementAt(j + 1); 338 } 339 if (cid == null) { 340 cid = MimeAssist.generateID(); 341 sourceLinks.add(fRes); 342 sourceLinks.add(cid); 343 } 344 attrs.item(i).setNodeValue("cid:" + cid); 345 } 346 } 347 } 348 } 349 // finish of opening particular element tag 350 NodeList children = node0.getChildNodes(); //Passing through the node tree 351 352 if (children != null) { 353 int len = children.getLength(); 354 355 for (int i = 0; i < len; i++) { 356 analyze(children.item(i)); 357 } 358 } 359 // start of closing particular element tag 360 if (elName.equalsIgnoreCase("ul")) { 361 pTagEnable = pTagEnable_old; 362 ul_ol = ul_ol_old; 363 indent = indent_old; 364 } else if (elName.equalsIgnoreCase("ol")) { 365 pTagEnable = pTagEnable_old; 366 ul_ol = ul_ol_old; 367 indent = indent_old; 368 olNumber = olNumber_old; 369 } else if (elName.equalsIgnoreCase("table")) { 370 plainText = plainText + brLine + 371 "**************************************************"; 372 } else if (elName.equalsIgnoreCase("tr")) { 373 plainText = plainText + brLine + 374 "--------------------------------------------------"; 375 } else if (elName.equalsIgnoreCase("td")) { 376 plainText = plainText + brLine + 377 "-- -- -- -- -- -- -- -- -- -- -- -- --"; 378 } else if (elName.equalsIgnoreCase("blockquote")) { 379 indent = indent_old; 380 pTagEnable = pTagEnable_old; 381 } else if (elName.equalsIgnoreCase("q")) { 382 plainText = plainText + "\""; 383 pTagEnable = pTagEnable_old; 384 } 385 386 break; 387 388 case Node.TEXT_NODE: 389 String nodeVal = node0.getNodeValue(); 390 391 plainText = plainText + nodeVal; 392 break; 393 } 394 395 } 396 397 /** 398 * Method checks if it is given a resource reachable in the destination file system. 399 * @param resource0 can be absolute or relative path with specified file name 400 * or adress of file in URL form (example "file:///c:/temp/example.gif" ) 401 * @return object of class File which represents existance of the resource file 402 * or null if resource does not exist on the destination in file system. 403 * @SMIMEException caused by non SMIMEException which is IOException. 404 */ 405 private File existenceOfResource(String resource0) throws SMIMEException { 406 407 boolean resourceIsUrl = true; 408 String resource = new String(resource0); 409 URL url = null; 410 411 try { 412 url = new URL(resource0); 413 } catch (MalformedURLException e) { 414 resourceIsUrl = false; 415 } 416 417 if (resourceIsUrl == true && (!url.getProtocol().equalsIgnoreCase("file"))) 418 return null; 419 else if (resourceIsUrl == true && url.getProtocol().equalsIgnoreCase("file")) { 420 resource = url.getFile(); 421 } 422 423 resource = replaceHex(resource); 424 resource = resource.replace('/', File.separatorChar); 425 resource = resource.replace('//', File.separatorChar); 426 File fRes = new File(resource); 427 428 try { 429 if (fRes.exists()) 430 return fRes.getAbsoluteFile().getCanonicalFile(); 431 432 fRes = new File(absolutPath + resource); 433 if (fRes.exists()) 434 return fRes.getAbsoluteFile().getCanonicalFile(); 435 436 fRes = new File(absolutPath + resource); 437 if (fRes.exists()) 438 return fRes.getAbsoluteFile().getCanonicalFile(); 439 } catch (Exception e) { 440 throw SMIMEException.getInstance(this, e, "existenceOfResource"); 441 } 442 443 return null; 444 } 445 446 /*** 447 * Replaces possible hexadecimal representation of blank characters (presented 448 * with "%20") from resource String representation, with blank character. 449 * @param resources0 resource which is examined for hex representation of blank 450 * characters. 451 * @return String with replaced hexadecimal representation of blank characters. 452 */ 453 private String replaceHex(String resources0) { 454 while (resources0.indexOf("%20") != -1) { 455 resources0 = resources0.substring(0, resources0.indexOf("%20")) + " " + 456 resources0.substring(resources0.indexOf("%20") + 3); 457 } 458 return resources0; 459 } 460 461 }

This page was automatically generated by Maven